# Load the attrition data set.
# stringsAsFactors = TRUE is required: the str() output below shows character
# columns as factors, and all downstream code (select_if(is.factor), the
# as.numeric() recodes of categorical columns, naiveBayes) depends on that.
# R >= 4.0 defaults to FALSE, which would silently break those steps.
employees <- read.csv("CaseStudy2-data.csv", stringsAsFactors = TRUE)
str(employees)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
# Confirm there are no missing values in any column (every count below is 0).
colSums(is.na(employees))
## ID Age Attrition
## 0 0 0
## BusinessTravel DailyRate Department
## 0 0 0
## DistanceFromHome Education EducationField
## 0 0 0
## EmployeeCount EmployeeNumber EnvironmentSatisfaction
## 0 0 0
## Gender HourlyRate JobInvolvement
## 0 0 0
## JobLevel JobRole JobSatisfaction
## 0 0 0
## MaritalStatus MonthlyIncome MonthlyRate
## 0 0 0
## NumCompaniesWorked Over18 OverTime
## 0 0 0
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## 0 0 0
## StandardHours StockOptionLevel TotalWorkingYears
## 0 0 0
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 0 0 0
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 0 0 0
# Distribution of the regression target; right-skewed (mean 6390 > median 4946).
summary(employees$MonthlyIncome)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1081 2840 4946 6390 8182 19999
# Class balance of the classification target: ~16% attrition ("Yes").
summary(employees$Attrition)
Are any ordinal variables currently being treated as continuous? Yes — the satisfaction/rating scales below are integer-coded ordinals, so recode them as factors.
# Recode JobInvolvement (1-4) as a factor with levels in their ordinal order.
# The previous character-overwrite approach produced an unordered factor with
# alphabetical levels (High < Low < ...), which loses the ordinal scale.
employees$JobInvolvement <- factor(employees$JobInvolvement,
                                   levels = 1:4,
                                   labels = c("Low", "Medium", "High", "Very High"))
summary(employees$JobInvolvement)
## Low Medium High Very High
## 47 228 514 81
# Recode JobSatisfaction (1-4) as a factor with levels in ordinal order
# (Low < Medium < High < Very High) instead of alphabetical order.
employees$JobSatisfaction <- factor(employees$JobSatisfaction,
                                    levels = 1:4,
                                    labels = c("Low", "Medium", "High", "Very High"))
summary(employees$JobSatisfaction)
## Low Medium High Very High
## 179 166 254 271
# Recode PerformanceRating (1-4) as a factor with levels in ordinal order.
# droplevels() removes the unobserved "Low" and "Good" categories (only
# ratings 3 and 4 occur in this data), matching the original behavior of
# keeping just the observed categories.
employees$PerformanceRating <- droplevels(
  factor(employees$PerformanceRating,
         levels = 1:4,
         labels = c("Low", "Good", "Excellent", "Outstanding"))
)
summary(employees$PerformanceRating)
## Excellent Outstanding
## 738 132
# Recode RelationshipSatisfaction (1-4) as a factor with levels in ordinal order.
employees$RelationshipSatisfaction <- factor(employees$RelationshipSatisfaction,
                                             levels = 1:4,
                                             labels = c("Low", "Medium", "High", "Very High"))
summary(employees$RelationshipSatisfaction)
## Low Medium High Very High
## 174 171 261 264
# Recode WorkLifeBalance (1-4) as a factor with levels in ordinal order
# (Bad < Good < Better < Best) instead of alphabetical order.
employees$WorkLifeBalance <- factor(employees$WorkLifeBalance,
                                    levels = 1:4,
                                    labels = c("Bad", "Good", "Better", "Best"))
summary(employees$WorkLifeBalance)
## Bad Good Better Best
## 48 192 532 98
Does job satisfaction differ across education fields? Visualize with a mosaic plot, then test formally.
# Mosaic plot: conditional distribution of job satisfaction given education field.
library(ggmosaic)
ggplot(data = employees) +
  geom_mosaic(aes(x = product(JobSatisfaction, EducationField),
                  fill = EducationField),
              na.rm = TRUE) +
  labs(x = "Education Field",
       y = "Job Satisfaction",
       title = "f(JobSatisfaction | Education Field)")
#Chi square says no significant difference in job satisfaction across education fields
# Contingency table: satisfaction level by education field.
table(employees$JobSatisfaction, employees$EducationField)
##
## Human Resources Life Sciences Marketing Medical Other
## High 4 103 27 80 14
## Low 3 75 21 52 11
## Medium 5 61 21 56 9
## Very High 3 119 31 82 18
##
## Technical Degree
## High 26
## Low 17
## Medium 14
## Very High 18
# The "approximation may be incorrect" warning comes from the small counts in
# the Human Resources column (cells of 3-5, giving low expected frequencies).
chisq.test(table(employees$JobSatisfaction, employees$EducationField))
## Warning in chisq.test(table(employees$JobSatisfaction,
## employees$EducationField)): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(employees$JobSatisfaction, employees$EducationField)
## X-squared = 7.1692, df = 15, p-value = 0.9528
# p = 0.95: no evidence that satisfaction differs by education field.
Which continuous variables differ in distribution across the response? Explore with pairs plots of the continuous variables, colored by Attrition.
library(GGally)
#there's 22 continuous variables
# Count the numeric predictor columns: 870 rows x 22 numeric variables.
dim(select_if(employees, is.numeric))
## [1] 870 22
# Pairs plots in chunks of ~5 numeric columns, colored by Attrition;
# sample_n(200) subsamples the rows so the panels stay legible.
# Numeric columns 1-5.
employees %>%
select_if(is.numeric) %>%
select(1:5) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
# Numeric columns 6-10. This chunk includes EmployeeCount, which is constant
# (always 1), hence the "standard deviation is zero" correlation warnings.
employees %>%
select_if(is.numeric) %>%
select(6:10) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
# Numeric columns 11-16. This chunk includes StandardHours, which is constant
# (always 80), producing the same zero-variance warnings.
employees %>%
select_if(is.numeric) %>%
select(11:16) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
# Numeric columns 17-22.
employees %>%
select_if(is.numeric) %>%
select(17:22) %>%
mutate(Attrition = employees$Attrition) %>%
sample_n(200) %>%
ggpairs(aes(colour = Attrition)) +
ggtitle("Pairs Plot")
Check for multicollinearity between continuous variables
library(corrplot)
## corrplot 0.84 loaded
## corrplot 0.84 loaded
# Correlation matrix of the numeric columns.
# Zero-variance columns (EmployeeCount and StandardHours are constant) are
# dropped first: cor() returns NA for them — the source of the "standard
# deviation is zero" warning — which would leave empty rows in the plot.
M <- employees %>%
  select_if(is.numeric) %>%
  select_if(function(col) sd(col) > 0) %>%
  cor()
corrplot(M, method = "circle")
What categorical variables have a difference in distribution in response?
#there's 14 categorical variables
# Count the categorical columns: 870 rows x 14 factor variables.
dim(select_if(employees, is.factor))
## [1] 870 14
# One stacked-proportion bar chart of Attrition for every categorical column.
# Uses the .data pronoun instead of eval(parse(text = ...)), which is fragile
# and an eval-injection anti-pattern; seq_along() instead of 1:length().
categs <- names(select_if(employees, is.factor))
for (i in seq_along(categs)) {
  print(
    employees %>%
      ggplot(aes(x = .data[[categs[i]]], fill = Attrition)) +
      geom_bar(position = "fill") +
      xlab(categs[i])
  )
}
Setting up the train/test split: a helper that balances the classes in the training set.
# Build a class-balanced train/test split for the imbalanced Attrition response.
#
# "No" rows outnumber "Yes" roughly 5:1, so the "No" class is cut into 8 folds
# and only one fold is used for training, while the "Yes" class is cut in half.
# This gives a roughly balanced training set; the test set keeps a realistic
# imbalanced mix (four "No" folds plus the remaining "Yes" fold). Folds 6-8 of
# the "No" class are intentionally unused.
#
# df: data frame with a two-level Attrition column ("No"/"Yes").
# Returns: list(train = <data frame>, test = <data frame>). Elements are named
# for readability; positional access ([[1]]/[[2]]) used by callers still works.
# Requires caret::createFolds. Each call draws a new random split, so callers
# must call it ONCE and reuse the result for a consistent train/test pair.
split_train_test <- function(df) {
  data_no  <- df[which(df$Attrition == "No"), ]
  data_yes <- df[which(df$Attrition == "Yes"), ]
  # More folds on "No" so one of its folds is comparable in size to half of "Yes".
  folds_no  <- createFolds(data_no$Attrition, k = 8)
  folds_yes <- createFolds(data_yes$Attrition, k = 2)
  train <- rbind(data_no[folds_no$Fold1, ],
                 data_yes[folds_yes$Fold1, ])
  test <- rbind(data_no[c(folds_no$Fold2, folds_no$Fold3,
                          folds_no$Fold4, folds_no$Fold5), ],
                data_yes[folds_yes$Fold2, ])
  list(train = train, test = test)
}
Predict attrition - Naive Bayes
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(plotROC)
#naive bayes
#balanced training set
# Split ONCE and reuse both halves. The previous code called
# split_train_test() twice, producing two independent random splits, so
# train1 and test1 could share rows (train/test leakage).
split1 <- split_train_test(employees)
train1 <- split1[[1]]
test1 <- split1[[2]]
# Naive Bayes on the balanced training set, evaluated on the held-out test set.
model.nb1 <- naiveBayes(Attrition ~ ., data=train1)
preds.nb1 <- predict(model.nb1, test1)
confusionMatrix(table(preds.nb1, test1$Attrition))
## Confusion Matrix and Statistics
##
##
## preds.nb1 No Yes
## No 263 17
## Yes 102 53
##
## Accuracy : 0.7264
## 95% CI : (0.6819, 0.7678)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3204
##
## Mcnemar's Test P-Value : 1.358e-14
##
## Sensitivity : 0.7205
## Specificity : 0.7571
## Pos Pred Value : 0.9393
## Neg Pred Value : 0.3419
## Prevalence : 0.8391
## Detection Rate : 0.6046
## Detection Prevalence : 0.6437
## Balanced Accuracy : 0.7388
##
## 'Positive' Class : No
##
# Re-predict class probabilities for the ROC curve; column 2 is P(Yes).
# NOTE(review): preds.nb1 is reused here for the ROCR prediction object,
# shadowing the class predictions computed above.
preds.nb1 <- predict(model.nb1, test1, type = "raw")
preds.nb1 <- prediction(preds.nb1[,2], test1$Attrition)
roc.perf_1 = performance(preds.nb1, measure = "tpr", x.measure = "fpr")
#unbalanced training set
# Comparison model: a plain 50/50 split that keeps the natural class
# imbalance, to contrast with the balanced split above.
folds <- createFolds(employees$Attrition, k=2)
train2 <- employees[folds$Fold1,]
test2 <- employees[folds$Fold2,]
model.nb2 <- naiveBayes(Attrition ~ ., data=train2)
preds.nb2 <- predict(model.nb2, test2)
confusionMatrix(table(preds.nb2, test2$Attrition))
## Confusion Matrix and Statistics
##
##
## preds.nb2 No Yes
## No 306 26
## Yes 59 44
##
## Accuracy : 0.8046
## 95% CI : (0.7642, 0.8408)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.9762369
##
## Kappa : 0.3922
##
## Mcnemar's Test P-Value : 0.0005187
##
## Sensitivity : 0.8384
## Specificity : 0.6286
## Pos Pred Value : 0.9217
## Neg Pred Value : 0.4272
## Prevalence : 0.8391
## Detection Rate : 0.7034
## Detection Prevalence : 0.7632
## Balanced Accuracy : 0.7335
##
## 'Positive' Class : No
##
# Probability-based predictions for the unbalanced model's ROC curve.
preds.nb2 <- predict(model.nb2, test2, type = "raw")
preds.nb2 <- prediction(preds.nb2[,2], test2$Attrition)
roc.perf_2 = performance(preds.nb2, measure = "tpr", x.measure = "fpr")
#the balanced one definitely looks better and .25 seems a good threshold
# red = balanced training set (roc.perf_1), blue = unbalanced (roc.perf_2).
plot(roc.perf_1, col="red")
plot(roc.perf_2, add = TRUE, col="blue")
Predict attrition - KNN
#KNN
##load the package class
library(class)
#resetting employees data set to original with ordinal columns because KNN only takes numeric
# stringsAsFactors = TRUE keeps Attrition a factor, consistent with the first read.
employees_num <- read.csv("CaseStudy2-data.csv", stringsAsFactors = TRUE)
employees_num <- employees_num %>%
  select_if(is.numeric) %>%
  mutate(Attrition = employees_num$Attrition)
# Split once and reuse: two separate split_train_test() calls would draw two
# independent random splits, letting train and test overlap.
split_num <- split_train_test(employees_num)
train <- split_num[[1]]
test <- split_num[[2]]
##run knn k=8 and make ROC
# class::knn's "prob" attribute is the vote share of the WINNING class, not
# P(class == "Yes"); feeding it to ROCR directly scrambles the ROC curve
# (which is why it "looks really bad"). Convert to P(Yes) first.
preds.knn <- knn(train[, names(train) != "Attrition"],
                 test[, names(train) != "Attrition"],
                 cl = train$Attrition, k = 8, prob = TRUE)
prob.knn <- attr(preds.knn, "prob")
prob.knn <- ifelse(preds.knn == "Yes", prob.knn, 1 - prob.knn)
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")@y.values[[1]]
plot(roc.perf_knn, colorize = TRUE)
# Sweep k = 1..80 and record the test-set AUC for each.
# Fixes vs the original loop: (1) one split per iteration instead of two
# independent splits (which let train/test overlap); (2) the winning-class
# vote share is converted to P(Yes) before ROCR; (3) the result vector is
# preallocated instead of grown with c().
auc.knns <- numeric(80)
for (i in seq_len(80)) {
  split_i <- split_train_test(employees_num)
  train <- split_i[[1]]
  test <- split_i[[2]]
  preds.knn <- knn(train[, names(train) != "Attrition"],
                   test[, names(train) != "Attrition"],
                   cl = train$Attrition, k = i, prob = TRUE)
  prob.knn <- attr(preds.knn, "prob")
  prob.knn <- ifelse(preds.knn == "Yes", prob.knn, 1 - prob.knn)
  preds.knn <- prediction(prob.knn, test$Attrition)
  auc.knns[i] <- performance(preds.knn, measure = "auc")@y.values[[1]]
}
plot(x = seq_len(80), y = auc.knns, xlab = "k",
     main = "AUCs of KNN models with k=[1,80]")
#all the KNN models are bad
#add categorical variables
# stringsAsFactors = TRUE is essential here: as.numeric() on a *character*
# column yields NAs with a warning, so the integer encodings below only work
# when the categorical columns come in as factors.
employees_num <- read.csv("CaseStudy2-data.csv", stringsAsFactors = TRUE)
# Encode each categorical column by its integer factor codes so knn can use it.
cat_cols <- c("BusinessTravel", "Department", "EducationField", "Gender",
              "JobRole", "MaritalStatus", "OverTime")
employees_num[cat_cols] <- lapply(employees_num[cat_cols], as.numeric)
employees_num <- employees_num %>%
  select_if(is.numeric) %>%
  mutate(Attrition = employees_num$Attrition)
# Split once and reuse (avoids the train/test overlap caused by calling
# split_train_test() twice), then encode Attrition as numeric 0/1 — the same
# result as the original character round-trip, in one step per data set.
split_num <- split_train_test(employees_num)
train <- split_num[[1]]
test <- split_num[[2]]
train$Attrition <- as.numeric(train$Attrition == "Yes")
test$Attrition <- as.numeric(test$Attrition == "Yes")
str(train)
## 'data.frame': 162 obs. of 35 variables:
## $ ID : int 4 30 42 91 100 104 117 122 124 129 ...
## $ Age : int 32 52 45 38 34 23 25 21 38 43 ...
## $ BusinessTravel : num 3 1 3 2 3 3 3 3 3 3 ...
## $ DailyRate : int 801 715 954 1490 401 885 1356 501 243 930 ...
## $ Department : num 3 2 3 2 2 2 3 3 3 2 ...
## $ DistanceFromHome : int 1 19 2 2 1 4 10 5 7 6 ...
## $ Education : int 4 4 2 2 3 3 4 1 4 3 ...
## $ EducationField : num 3 4 6 2 2 4 2 4 3 4 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 2016 791 783 556 1447 705 1240 2021 709 1402 ...
## $ EnvironmentSatisfaction : int 3 4 2 4 4 1 3 3 4 1 ...
## $ Gender : num 1 2 2 2 1 2 2 2 1 1 ...
## $ HourlyRate : int 48 41 46 42 86 58 57 58 46 73 ...
## $ JobInvolvement : int 3 3 1 3 2 4 3 3 2 2 ...
## $ JobLevel : int 3 1 2 1 1 1 2 1 2 2 ...
## $ JobRole : num 8 7 9 3 3 7 8 9 8 7 ...
## $ JobSatisfaction : int 4 4 3 4 2 1 4 1 4 3 ...
## $ MaritalStatus : num 2 2 3 2 2 2 3 3 3 3 ...
## $ MonthlyIncome : int 10422 4258 6632 1702 3294 2819 4950 2380 4028 4081 ...
## $ MonthlyRate : int 24032 26589 12388 12106 3708 8544 20623 25479 7791 20003 ...
## $ NumCompaniesWorked : int 1 0 0 1 5 2 0 1 0 1 ...
## $ OverTime : num 1 1 1 2 1 1 1 2 1 2 ...
## $ PercentSalaryHike : int 19 18 13 23 17 16 14 11 20 14 ...
## $ PerformanceRating : int 3 3 3 4 3 3 3 3 4 3 ...
## $ RelationshipSatisfaction: int 3 1 1 3 1 1 2 4 1 1 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 2 1 0 1 1 1 0 0 0 0 ...
## $ TotalWorkingYears : int 14 5 9 1 7 5 5 2 8 20 ...
## $ TrainingTimesLastYear : int 3 3 3 3 2 3 4 6 2 3 ...
## $ WorkLifeBalance : int 3 3 3 3 2 4 3 3 3 1 ...
## $ YearsAtCompany : int 14 4 8 1 5 3 4 2 7 20 ...
## $ YearsInCurrentRole : int 10 3 7 0 4 2 3 2 7 7 ...
## $ YearsSinceLastPromotion : int 5 1 3 0 0 0 1 1 0 1 ...
## $ YearsWithCurrManager : int 7 2 1 0 2 2 1 2 5 8 ...
## $ Attrition : num 0 0 0 0 0 0 0 0 0 0 ...
str(test)
## 'data.frame': 435 obs. of 35 variables:
## $ ID : int 5 6 19 26 32 33 48 76 85 89 ...
## $ Age : int 24 27 34 44 33 46 38 30 36 27 ...
## $ BusinessTravel : num 2 2 3 3 3 3 3 2 3 2 ...
## $ DailyRate : int 567 294 181 1099 267 488 770 721 429 994 ...
## $ Department : num 2 2 2 3 2 3 3 2 2 3 ...
## $ DistanceFromHome : int 2 10 2 5 21 2 10 1 2 8 ...
## $ Education : int 1 2 4 3 3 3 4 2 4 3 ...
## $ EducationField : num 6 2 4 3 4 6 3 4 2 2 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1646 733 1755 1267 1698 363 1119 57 1294 56 ...
## $ EnvironmentSatisfaction : int 1 4 4 2 2 3 3 3 3 4 ...
## $ Gender : num 1 2 2 2 2 1 2 1 1 2 ...
## $ HourlyRate : int 32 32 97 88 79 75 73 58 53 37 ...
## $ JobInvolvement : int 3 3 4 3 4 1 2 3 3 3 ...
## $ JobLevel : int 1 3 1 5 1 4 3 2 2 3 ...
## $ JobRole : num 7 5 7 4 3 4 8 3 5 8 ...
## $ JobSatisfaction : int 4 1 4 2 2 2 3 4 2 3 ...
## $ MaritalStatus : num 3 1 2 2 2 2 1 3 3 3 ...
## $ MonthlyIncome : int 3760 8793 2932 18213 2028 16872 8740 4011 5410 8726 ...
## $ MonthlyRate : int 17218 4809 5586 8751 13637 14977 5569 10781 2323 2975 ...
## $ NumCompaniesWorked : int 1 1 0 7 1 3 0 1 9 1 ...
## $ OverTime : num 2 1 2 1 1 2 2 1 2 1 ...
## $ PercentSalaryHike : int 13 21 14 11 18 12 14 23 11 15 ...
## $ PerformanceRating : int 3 4 3 3 3 3 3 4 3 3 ...
## $ RelationshipSatisfaction: int 3 3 1 3 4 2 2 4 4 4 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 2 3 1 3 1 2 0 0 0 ...
## $ TotalWorkingYears : int 6 9 6 26 14 28 9 12 18 9 ...
## $ TrainingTimesLastYear : int 2 4 3 5 6 2 2 2 2 0 ...
## $ WorkLifeBalance : int 3 2 3 3 3 2 3 3 3 3 ...
## $ YearsAtCompany : int 6 9 5 22 14 7 8 12 16 9 ...
## $ YearsInCurrentRole : int 3 7 0 9 11 7 7 8 14 8 ...
## $ YearsSinceLastPromotion : int 1 1 1 3 2 7 2 3 5 1 ...
## $ YearsWithCurrManager : int 3 7 2 10 13 7 7 7 12 7 ...
## $ Attrition : num 0 0 0 0 0 0 0 0 0 0 ...
##run knn k=18 and make ROC
# As before, knn's "prob" attribute is the winning-class vote share; convert
# it to P(class == 1) before building the ROC. The stray append to the stale
# auc.knns vector (immediately reset below) has been removed.
preds.knn <- knn(train[, names(train) != "Attrition"],
                 test[, names(train) != "Attrition"],
                 cl = as.factor(train$Attrition), k = 18, prob = TRUE)
prob.knn <- attr(preds.knn, "prob")
prob.knn <- ifelse(preds.knn == "1", prob.knn, 1 - prob.knn)
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")@y.values[[1]]
plot(roc.perf_knn, colorize = TRUE)
# Sweep k = 1..80 with the categorical-augmented features. Same fixes as the
# earlier sweep: one split per iteration (not two independent splits),
# winning-class vote share converted to P(Yes), preallocated result vector.
# Note the fresh splits of employees_num have factor Attrition ("No"/"Yes").
auc.knns <- numeric(80)
for (i in seq_len(80)) {
  split_i <- split_train_test(employees_num)
  train <- split_i[[1]]
  test <- split_i[[2]]
  preds.knn <- knn(train[, names(train) != "Attrition"],
                   test[, names(train) != "Attrition"],
                   cl = train$Attrition, k = i, prob = TRUE)
  prob.knn <- attr(preds.knn, "prob")
  prob.knn <- ifelse(preds.knn == "Yes", prob.knn, 1 - prob.knn)
  preds.knn <- prediction(prob.knn, test$Attrition)
  auc.knns[i] <- performance(preds.knn, measure = "auc")@y.values[[1]]
}
plot(x = seq_len(80), y = auc.knns, xlab = "k",
     main = "AUCs of KNN models with k=[1,80]")
#they're all still bad
#RF
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Random forest on a fresh balanced split. Split once and reuse — the
# previous two separate split_train_test() calls drew independent random
# splits, so train and test could share rows (leakage).
split_rf <- split_train_test(employees)
train <- split_rf[[1]]
test <- split_rf[[2]]
model.rf <- randomForest(Attrition ~ ., data = train, importance = TRUE)
preds.rf <- predict(model.rf, test)
confusionMatrix(table(preds.rf, test$Attrition), positive = "Yes")
## Confusion Matrix and Statistics
##
##
## preds.rf No Yes
## No 314 12
## Yes 51 58
##
## Accuracy : 0.8552
## 95% CI : (0.8185, 0.8869)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.1993
##
## Kappa : 0.5623
##
## Mcnemar's Test P-Value : 1.688e-06
##
## Sensitivity : 0.8286
## Specificity : 0.8603
## Pos Pred Value : 0.5321
## Neg Pred Value : 0.9632
## Prevalence : 0.1609
## Detection Rate : 0.1333
## Detection Prevalence : 0.2506
## Balanced Accuracy : 0.8444
##
## 'Positive' Class : Yes
##
# Class probabilities for the random forest ROC; column 2 is P(Yes).
prob.rf <- predict(model.rf, test, type = "prob")
preds.rf <- prediction(prob.rf[,2], test$Attrition)
roc.rf = performance(preds.rf, measure = "tpr", x.measure = "fpr")
# red = balanced naive Bayes ROC (roc.perf_1), blue = random forest (roc.rf).
plot(roc.perf_1, col="red")
plot(roc.rf, add = TRUE, col="blue")
Random forest wins!
Regression time
What continuous variables have a difference in distribution of the Monthly Income? pairs plots continuous
library(GGally)
# Fresh read for the regression task, discarding the earlier factor recodes.
# stringsAsFactors = TRUE matches the original read so lm() sees factor
# predictors with the same levels as before.
employees <- read.csv("CaseStudy2-data.csv", stringsAsFactors = TRUE)
# Over18 is "Y" for every row (a single-level factor), which lm() cannot
# build contrasts for — drop it.
employees$Over18 <- NULL
#there's 22 numeric variables
# Count the numeric columns in the reloaded data: 870 rows x 27 variables.
dim(select_if(employees, is.numeric))
## [1] 870 27
# Monthly income is strongly right-skewed, motivating the log transform below.
ggplot(employees, aes(x = MonthlyIncome)) +
  geom_histogram()
# Pairs plots of the 27 numeric columns in chunks, each augmented with
# MonthlyIncome; sample_n(200) keeps the panels legible.
# Numeric columns 1-5.
employees %>%
select_if(is.numeric) %>%
select(1:5) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
# Numeric columns 6-10. Includes the constant EmployeeCount column, which is
# why cor() warns that the standard deviation is zero.
employees %>%
select_if(is.numeric) %>%
select(6:10) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
# Numeric columns 11-15.
employees %>%
select_if(is.numeric) %>%
select(11:15) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
# Numeric columns 16-20. Includes the constant StandardHours column, again
# producing zero-variance correlation warnings.
employees %>%
select_if(is.numeric) %>%
select(16:20) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
# Numeric columns 21-27.
employees %>%
select_if(is.numeric) %>%
select(21:27) %>%
mutate(MonthlyIncome = employees$MonthlyIncome) %>%
sample_n(200) %>%
ggpairs() +
ggtitle("Pairs Plot")
# Log-transform the right-skewed income; this becomes the regression response.
employees$logMonthlyIncome = log(employees$MonthlyIncome)
Predicting Salary – linear Regression
str(employees)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
## $ logMonthlyIncome : num 8.39 9.88 9.14 9.25 8.23 ...
# Regress log income on the predictors.
# Exclusions: MonthlyIncome is the untransformed response itself — leaving it
# in lets the model trivially predict its own log (target leakage). ID and
# EmployeeNumber are arbitrary identifiers with no predictive meaning, and
# EmployeeCount and StandardHours are constant (they would produce NA
# coefficients).
model.lm <- lm(logMonthlyIncome ~ . - MonthlyIncome - ID - EmployeeNumber
               - EmployeeCount - StandardHours,
               data = employees)
# Residual diagnostic plots for the fitted model.
plot(model.lm)